# Import core libraries
import pandas as pd
import numpy as np
# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# .RData file reader
import pyreadr
# Zip-archive extraction
import zipfile as zp
# scikit-learn: preprocessing, metrics, model selection
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import (
    GridSearchCV,  # added: used by the hyperparameter-tuning cell but never imported
    KFold,
    cross_val_score,
    train_test_split,
)
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
Converting the .RData files into pandas DataFrames
# #extract from zipfile
# with zp.ZipFile("dataset\TEP_FaultFree_Training.RData (1).zip", 'r') as zip_ref:
# zip_ref.extractall("dataset")
# with zp.ZipFile("dataset/TEP_Faulty_Training.RData.zip", 'r') as zip_ref:
# zip_ref.extractall("dataset")
# # pyreadr converts rData into dictionary type object.
# result = pyreadr.read_r("dataset/TEP_Faulty_Training.RData")
# result1 = pyreadr.read_r("dataset/TEP_FaultFree_Training.RData")
# result2 = pyreadr.read_r("/kaggle/input/tennessee-eastman-process-simulation-dataset/TEP_FaultFree_Testing.RData")
# result3 = pyreadr.read_r("/kaggle/input/tennessee-eastman-process-simulation-dataset/TEP_Faulty_Testing.RData")
# #dataframes
# df_train = result['faulty_training']
# df_ff = result1['fault_free_training']
# df_test = result2['fault_free_testing']
# df_test_ff = result3['faulty_testing']
# #save in csv file
# df_ff.to_csv("dataset/fault_free_training.csv")
# df_train.to_csv("dataset/faulty_training.csv")
# Load the pre-converted CSVs (written once by the commented-out pyreadr cells
# above): fault-free and faulty Tennessee Eastman Process training runs.
# "Unnamed: 0" is the integer index that to_csv saved as the first column.
df_ff = pd.read_csv("dataset/fault_free_training.csv",index_col="Unnamed: 0")
df_faulty = pd.read_csv("dataset/faulty_training.csv",index_col="Unnamed: 0")
# Stack faulty + fault-free rows into a single frame (indexes may repeat).
df = pd.concat([df_faulty,df_ff])
# dft = pd.concat([df_test,df_test_ff])
print("len df:",len(df))
# print("len dft:",len(dft))
df.head()
len df: 5250000
| faultNumber | simulationRun | sample | xmeas_1 | xmeas_2 | xmeas_3 | xmeas_4 | xmeas_5 | xmeas_6 | xmeas_7 | ... | xmv_2 | xmv_3 | xmv_4 | xmv_5 | xmv_6 | xmv_7 | xmv_8 | xmv_9 | xmv_10 | xmv_11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1 | 0.25038 | 3674.0 | 4529.0 | 9.2320 | 26.889 | 42.402 | 2704.3 | ... | 53.744 | 24.657 | 62.544 | 22.137 | 39.935 | 42.323 | 47.757 | 47.510 | 41.258 | 18.447 |
| 1 | 1.0 | 1.0 | 2 | 0.25109 | 3659.4 | 4556.6 | 9.4264 | 26.721 | 42.576 | 2705.0 | ... | 53.414 | 24.588 | 59.259 | 22.084 | 40.176 | 38.554 | 43.692 | 47.427 | 41.359 | 17.194 |
| 2 | 1.0 | 1.0 | 3 | 0.25038 | 3660.3 | 4477.8 | 9.4426 | 26.875 | 42.070 | 2706.2 | ... | 54.357 | 24.666 | 61.275 | 22.380 | 40.244 | 38.990 | 46.699 | 47.468 | 41.199 | 20.530 |
| 3 | 1.0 | 1.0 | 4 | 0.24977 | 3661.3 | 4512.1 | 9.4776 | 26.758 | 42.063 | 2707.2 | ... | 53.946 | 24.725 | 59.856 | 22.277 | 40.257 | 38.072 | 47.541 | 47.658 | 41.643 | 18.089 |
| 4 | 1.0 | 1.0 | 5 | 0.29405 | 3679.0 | 4497.0 | 9.3381 | 26.889 | 42.650 | 2705.1 | ... | 53.658 | 28.797 | 60.717 | 21.947 | 39.144 | 41.955 | 47.645 | 47.346 | 41.507 | 18.461 |
5 rows × 55 columns
| faultNumber | simulationRun | sample | xmeas_1 | xmeas_2 | xmeas_3 | xmeas_4 | xmeas_5 | xmeas_6 | xmeas_7 | ... | xmv_2 | xmv_3 | xmv_4 | xmv_5 | xmv_6 | xmv_7 | xmv_8 | xmv_9 | xmv_10 | xmv_11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29 | 1.0 | 1.0 | 30 | 0.32790 | 3672.1 | 4498.5 | 9.3309 | 26.882 | 42.230 | 2731.1 | ... | 54.306 | 31.892 | 61.533 | 19.979 | 40.044 | 37.061 | 50.021 | 48.708 | 40.319 | 17.414 |
| 39 | 1.0 | 1.0 | 40 | 0.57525 | 3603.4 | 4487.5 | 9.1370 | 26.848 | 42.636 | 2783.8 | ... | 54.260 | 56.716 | 60.736 | 20.244 | 38.633 | 33.167 | 49.407 | 49.782 | 40.344 | 17.442 |
| 49 | 1.0 | 1.0 | 50 | 0.70478 | 3682.0 | 4521.7 | 8.9201 | 27.062 | 42.055 | 2806.5 | ... | 54.007 | 69.024 | 54.835 | 21.451 | 32.382 | 38.387 | 47.022 | 50.995 | 40.224 | 19.915 |
| 59 | 1.0 | 1.0 | 60 | 0.84370 | 3628.9 | 4534.6 | 8.4858 | 27.113 | 42.264 | 2796.1 | ... | 54.031 | 82.589 | 57.099 | 21.442 | 32.739 | 43.258 | 49.091 | 57.335 | 40.265 | 19.471 |
| 69 | 1.0 | 1.0 | 70 | 0.97465 | 3707.0 | 4478.6 | 8.3211 | 26.866 | 42.252 | 2750.6 | ... | 53.978 | 95.883 | 54.466 | 20.742 | 33.313 | 38.342 | 48.100 | 68.268 | 39.561 | 17.498 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 29 | 0.0 | 1.0 | 30 | 0.22400 | 3674.7 | 4499.5 | 9.3584 | 26.886 | 42.296 | 2696.9 | ... | 54.318 | 21.672 | 61.716 | 22.086 | 40.509 | 37.395 | 50.475 | 48.724 | 40.977 | 17.522 |
| 39 | 0.0 | 1.0 | 40 | 0.25421 | 3601.1 | 4486.7 | 9.3263 | 26.846 | 42.582 | 2697.6 | ... | 54.250 | 25.135 | 61.987 | 21.831 | 42.732 | 33.552 | 50.133 | 50.302 | 40.587 | 16.976 |
| 49 | 0.0 | 1.0 | 50 | 0.22461 | 3674.6 | 4498.1 | 9.4294 | 27.063 | 42.056 | 2696.3 | ... | 53.724 | 21.790 | 58.189 | 21.901 | 40.060 | 38.638 | 47.650 | 50.597 | 40.292 | 18.876 |
| 59 | 0.0 | 1.0 | 60 | 0.23528 | 3627.5 | 4508.4 | 9.3557 | 27.116 | 42.405 | 2701.2 | ... | 53.717 | 22.739 | 62.817 | 21.751 | 41.602 | 43.162 | 49.351 | 51.360 | 40.891 | 18.118 |
| 69 | 0.0 | 1.0 | 70 | 0.29211 | 3721.8 | 4493.8 | 9.4493 | 26.870 | 42.532 | 2701.6 | ... | 54.160 | 28.743 | 61.872 | 22.063 | 40.384 | 37.918 | 48.010 | 50.928 | 40.612 | 16.047 |
105 rows × 55 columns
# df_test = pd.DataFrame(data=["predict"],columns=["Action"])
# dft = pd.concat([df_test, df[(df["simulationRun"]==1) & ((df["sample"]==30) | (df["sample"]==40) | (df["sample"]==50) | (df["sample"]==60) | (df["sample"]==70))].iloc[:100,3:]])
# dft.fillna(method="ffill",inplace=True)
# # dft.reset_index()
# dft.dropna(inplace=True)
# Inspect the column layout: 3 id columns, 41 sensors (xmeas_*), 11 of the
# 12 manipulated variables (xmv_*) — xmv_12 is absent from this dataset.
df.columns
Index(['faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2',
'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7', 'xmeas_8',
'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13', 'xmeas_14',
'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19', 'xmeas_20',
'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25', 'xmeas_26',
'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31', 'xmeas_32',
'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37', 'xmeas_38',
'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2', 'xmv_3', 'xmv_4',
'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10', 'xmv_11'],
dtype='object')
The TEP comprises 8 chemical components in total: 4 reactants, 2 products, 1 by-product and 1 inert component. These components undergo a chemical process dominated by 5 main process units: a reactor that allows for the reaction of the gaseous feed components (A, C, D and E) into liquid products (G and H), a condenser to cool down the gaseous product stream coming out of the reactor, a gas-liquid separator to split gas and liquid components from the cooled product stream, a centrifugal compressor to flow this gas stream back into the reactor, and a stripper to handle the efficient separation of the 2 products from any unreacted feed components. There is also a purge to remove the inert component (B) and the by-product (F) from the system. An overview of the industrial process is given below.

# Human-readable names for the TEP columns, keyed by the upper-cased column
# name: 41 process measurements (XMEAS_*) and 12 manipulated variables (XMV_*).
_process_measurements = [
    'A_feed_stream', 'D_feed_stream', 'E_feed_stream',
    'Total_fresh_feed_stripper', 'Recycle_flow_into_rxtr',
    'Reactor_feed_rate', 'Reactor_pressure', 'Reactor_level', 'Reactor_temp',
    'Purge_rate', 'Separator_temp', 'Separator_level', 'Separator_pressure',
    'Separator_underflow', 'Stripper_level', 'Stripper_pressure',
    'Stripper_underflow', 'Stripper_temperature', 'Stripper_steam_flow',
    'Compressor_work', 'Reactor_cooling_water_outlet_temp',
    'Condenser_cooling_water_outlet_temp',
]
# XMEAS_23-28: composition of components A-F in the reactor feed.
_process_measurements += [f'Composition_of_{c}_rxtr_feed' for c in 'ABCDEF']
# XMEAS_29-36: composition of components A-H in the purge stream.
_process_measurements += [f'Composition_of_{c}_purge' for c in 'ABCDEFGH']
# XMEAS_37-41: composition of components D-H in the product stream.
_process_measurements += [f'Composition_of_{c}_product' for c in 'DEFGH']
_manipulated_variables = [
    'D_feed_flow_valve', 'E_feed_flow_valve', 'A_feed_flow_valve',
    'Total_feed_flow_stripper_valve', 'Compressor_recycle_valve',
    'Purge_valve', 'Separator_pot_liquid_flow_valve',
    'Stripper_liquid_product_flow_valve', 'Stripper_steam_valve',
    'Reactor_cooling_water_flow_valve', 'Condenser_cooling_water_flow_valve',
    'Agitator_speed',
]
# Assemble the mapping in the original XMEAS_1..41 then XMV_1..12 order.
X_dict = {f'XMEAS_{i}': name
          for i, name in enumerate(_process_measurements, start=1)}
X_dict.update({f'XMV_{i}': name
               for i, name in enumerate(_manipulated_variables, start=1)})
# df = df.rename(columns = lambda x:X_dict[x.upper()] if x.upper() in X_dict.keys() else x)
# dft = dft.rename(columns = lambda x:X_dict[x.upper()] if x.upper() in X_dict.keys() else x)
The TEP variables (columns 4 to 55) were sampled every 3 minutes, for total durations of 25 hours (training) and 48 hours (testing) respectively. The faults were introduced 1 hour into the Faulty Training dataset and 8 hours into the Faulty Testing dataset.
# pp= df[(df['faultNumber']==0) & (df['simulationRun']==1)]
# feature_columns = df.columns[3:]
# num_features = len(feature_columns)
# num_cols = min(4, num_features) # Maximum of 4 columns
# num_rows = int(np.ceil(num_features / num_cols))
# fig, axes = plt.subplots(num_rows, num_cols, figsize=(4 * num_cols, 3 * num_rows))
# for i, col in enumerate(feature_columns):
# row_index = i // num_cols
# col_index = i % num_cols
# ax = axes[row_index, col_index]
# ax.plot(pp['sample'], pp[col])
# ax.set_xlabel('sample no.')
# ax.set_ylabel(col)
# ax.set_title(col.replace("_"," "))
# for i in range(num_features, num_rows * num_cols):
# fig.delaxes(axes.flatten()[i])
# plt.tight_layout()
# plt.show()
# # col = 'A_feed_stream'
# col = 'xmeas_1'
# simRuns = np.arange(1, 500, 50)
# num_simRuns = len(simRuns)
# num_cols = min(3, num_simRuns) # Maximum of 2 columns
# num_rows = int(np.ceil(num_simRuns / num_cols))
# # Create subplots
# fig, axes = plt.subplots(num_rows, num_cols, figsize=(3 * num_cols, 2
# * num_rows))
# for i, simRun in enumerate(simRuns):
# pp = df[(df['faultNumber'] == 0) & (df['simulationRun'] == simRun)]
# row_index = i // num_cols
# col_index = i % num_cols
# ax = axes[row_index, col_index]
# ax.plot(pp['sample'], pp[col])
# ax.set_xlabel('sample no.')
# ax.set_ylabel(col)
# ax.set_title(f'Simulation run - {simRun}')
# # Remove any empty subplots
# for i in range(num_simRuns, num_rows * num_cols):
# fig.delaxes(axes.flatten()[i])
# plt.tight_layout()
# plt.show()
# feature_columns = df.columns[3:]
# num_features = len(feature_columns)
# num_cols = min(4, num_features) # Maximum of 4 columns
# num_rows = int(np.ceil(num_features / num_cols))
# fig, axes = plt.subplots(num_rows, num_cols, figsize=(3 * num_cols, 2* num_rows))
# for i, col in enumerate(feature_columns):
# row_index = i // num_cols
# col_index = i % num_cols
# ax = axes[row_index, col_index]
# ax.boxplot(pp[col])
# ax.set_xlabel('sample no.')
# ax.set_ylabel(col)
# ax.set_title(col.replace("_"," "))
# for i in range(num_features, num_rows * num_cols):
# fig.delaxes(axes.flatten()[i])
# plt.tight_layout()
# plt.show()
# df.iloc[:,3:].describe()
| xmeas_1 | xmeas_2 | xmeas_3 | xmeas_4 | xmeas_5 | xmeas_6 | xmeas_7 | xmeas_8 | xmeas_9 | xmeas_10 | ... | xmv_2 | xmv_3 | xmv_4 | xmv_5 | xmv_6 | xmv_7 | xmv_8 | xmv_9 | xmv_10 | xmv_11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | ... | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 | 5.250000e+06 |
| mean | 2.608409e-01 | 3.663777e+03 | 4.506178e+03 | 9.369238e+00 | 2.690159e+01 | 4.236293e+01 | 2.722150e+03 | 7.488799e+01 | 1.204002e+02 | 3.459646e-01 | ... | 5.430117e+01 | 3.015595e+01 | 6.315545e+01 | 2.291970e+01 | 3.992928e+01 | 3.807390e+01 | 4.644203e+01 | 5.047991e+01 | 4.190826e+01 | 1.880923e+01 |
| std | 1.461083e-01 | 4.277760e+01 | 1.086998e+02 | 3.563536e-01 | 2.310677e-01 | 3.132701e-01 | 7.427919e+01 | 1.315495e+00 | 7.127389e-02 | 8.397854e-02 | ... | 5.132918e+00 | 2.003898e+01 | 7.238756e+00 | 1.081725e+01 | 1.262621e+01 | 2.949913e+00 | 2.358218e+00 | 1.719376e+01 | 9.773339e+00 | 5.064388e+00 |
| min | -4.735000e-03 | 3.327400e+03 | 3.540700e+03 | 6.639900e+00 | 2.538600e+01 | 3.965600e+01 | 2.413800e+03 | 6.187300e+01 | 1.196300e+02 | 1.839600e-02 | ... | 3.118400e+01 | -2.718600e-01 | -4.893100e-03 | -9.941000e-02 | 0.000000e+00 | 2.280100e+01 | 3.492300e+01 | -3.635500e-01 | -2.809500e-01 | -5.327000e-03 |
| 25% | 2.191800e-01 | 3.636800e+03 | 4.470000e+03 | 9.266000e+00 | 2.676000e+01 | 4.218200e+01 | 2.697800e+03 | 7.444200e+01 | 1.203800e+02 | 3.234700e-01 | ... | 5.350800e+01 | 2.230300e+01 | 6.032300e+01 | 2.146100e+01 | 3.836600e+01 | 3.605000e+01 | 4.488100e+01 | 4.554300e+01 | 4.058800e+01 | 1.714200e+01 |
| 50% | 2.510900e-01 | 3.663300e+03 | 4.508500e+03 | 9.354600e+00 | 2.690100e+01 | 4.234800e+01 | 2.705600e+03 | 7.495500e+01 | 1.204000e+02 | 3.363500e-01 | ... | 5.397000e+01 | 2.517600e+01 | 6.148600e+01 | 2.213500e+01 | 3.997300e+01 | 3.814700e+01 | 4.634900e+01 | 4.804800e+01 | 4.119800e+01 | 1.828500e+01 |
| 75% | 2.823600e-01 | 3.690100e+03 | 4.547100e+03 | 9.452500e+00 | 2.704300e+01 | 4.252200e+01 | 2.714500e+03 | 7.543700e+01 | 1.204200e+02 | 3.484600e-01 | ... | 5.443200e+01 | 2.862500e+01 | 6.279000e+01 | 2.272400e+01 | 4.147700e+01 | 3.999700e+01 | 4.803700e+01 | 5.166200e+01 | 4.190600e+01 | 1.951200e+01 |
| max | 1.017500e+00 | 3.906700e+03 | 5.175800e+03 | 1.224000e+01 | 2.848600e+01 | 4.465300e+01 | 3.000500e+03 | 8.701700e+01 | 1.210100e+02 | 8.207300e-01 | ... | 1.000000e+02 | 1.001900e+02 | 1.000200e+02 | 1.001100e+02 | 9.752800e+01 | 5.312100e+01 | 5.998000e+01 | 1.005200e+02 | 1.003200e+02 | 1.000100e+02 |
8 rows × 52 columns
# Per-feature variance in ascending order — shows several near-constant
# sensors (e.g. xmeas_37/39) and a few very high-variance ones (xmeas_3/16).
df.iloc[:,3:].var().sort_values()
xmeas_37 0.000104 xmeas_39 0.000175 xmeas_9 0.005080 xmeas_10 0.007052 xmeas_38 0.008152 xmeas_28 0.015529 xmeas_26 0.017567 xmeas_32 0.021203 xmeas_1 0.021348 xmeas_34 0.028962 xmeas_36 0.033107 xmeas_24 0.048654 xmeas_5 0.053392 xmeas_30 0.081770 xmeas_6 0.098138 xmeas_35 0.116158 xmeas_4 0.126988 xmeas_40 0.337786 xmeas_41 0.368914 xmeas_17 0.419413 xmeas_27 0.879099 xmeas_12 1.004801 xmeas_15 1.038325 xmeas_14 1.210292 xmeas_21 1.608266 xmeas_33 1.724893 xmeas_8 1.730527 xmeas_22 1.924037 xmeas_23 3.022118 xmeas_11 3.096841 xmeas_18 3.301689 xmeas_25 3.691014 xmv_8 5.561190 xmeas_29 6.819553 xmv_7 8.701986 xmeas_31 8.751433 xmv_1 10.689724 xmv_11 25.648027 xmv_2 26.346850 xmv_4 52.399593 xmv_10 95.518165 xmv_5 117.012888 xmeas_20 121.172139 xmv_6 159.421272 xmv_9 295.625258 xmv_3 401.560884 xmeas_2 1829.923008 xmeas_19 4615.665383 xmeas_7 5517.398028 xmeas_13 5603.044112 xmeas_16 5932.077347 xmeas_3 11815.655286 dtype: float64
# fig, ax = plt.subplots(13,4,figsize=(30,70))
# for i,column in enumerate(df.columns[3:]): #select all the feature columns to plot
# sns.kdeplot(data=df, x=column, hue="faultNumber",palette="hls",ax=ax.ravel()[i],fill=True, legend=False)
# # Load the data into a pandas DataFrame
# data = df[df['simulationRun']==1].iloc[:,3:]
# # Calculate the correlation matrix
# corr = data.corr()
# # Create a heatmap with annotations
# sns.set(style='white')
# mask = np.triu(np.ones_like(corr, dtype=bool))
# f, ax = plt.subplots(figsize=(50, 50))
# sns.heatmap(corr, mask=mask, cmap='coolwarm', annot=True, fmt='.2f',
# square=True, linewidths=.5, cbar_kws={"shrink": .5})
# plt.show()
# corr_matrix = data.corr()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.90)]
# print("No. of Columns to drop",len(to_drop))
# to_drop
No. of Columns to drop 14
['xmeas_13', 'xmeas_16', 'xmeas_25', 'xmeas_29', 'xmeas_31', 'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_11']
# The 14 columns flagged by the correlation analysis above (|r| > 0.90 with
# some other column) — candidates for removal before modelling.
to_drop = [
    'xmeas_13', 'xmeas_16', 'xmeas_25', 'xmeas_29', 'xmeas_31',
    'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36',
    'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_11',
]
def wrangle(df, drop, simulation):
    """Reduce the raw TEP frame to a modelling subset.

    Parameters
    ----------
    df : pd.DataFrame
        Full TEP frame with 'faultNumber', 'simulationRun' and 'sample' columns.
    drop : list[str] | None
        Columns to remove (e.g. the highly-correlated ``to_drop`` list),
        or None to keep every column.
    simulation : sequence of two numbers
        Exclusive (low, high) bounds on ``simulationRun``.

    Returns
    -------
    pd.DataFrame
        Rows with simulation[0] < simulationRun < simulation[1] and
        sample > 20, with fault classes 3, 9 and 15 removed.
    """
    low, high = simulation
    # sample > 20 skips the first hour of each run (20 samples at 3 min each),
    # i.e. the pre-fault portion of the faulty training runs.
    reduced_data = df[
        (df['simulationRun'] > low)
        & (df['simulationRun'] < high)
        & (df['sample'] > 20)
    ]
    if drop is not None:
        # Bug fix: the original dropped the module-level ``to_drop`` list no
        # matter what was passed here; honour the ``drop`` argument instead.
        reduced_data = reduced_data.drop(columns=drop)
    # Faults 3, 9 and 15 are excluded from the study.
    return reduced_data[~reduced_data['faultNumber'].isin([3, 9, 15])]
# Working dataset: simulation runs 2-19 (exclusive bounds), post-fault
# samples only, all 52 sensor columns kept.
reu_df = wrangle(df,drop=None,simulation=[1,20])
# reu_dft = wrangle(dft,drop=None,simulation=[1,4])
# Variant with the 14 highly-correlated columns removed (not used below).
reu_df_drop = wrangle(df,drop=to_drop,simulation=[1,20])
# reu_dft_drop = wrangle(dft,drop=to_drop,simulation=[1,4])
# Standardise the feature columns (zero mean, unit variance); the fitted
# scaler is reused later for new/unseen samples.
sc = StandardScaler()
sc.fit(reu_df.iloc[:,3:])
X = sc.transform(reu_df.iloc[:,3:])
Y = reu_df['faultNumber']
# Encode fault numbers to contiguous class ids (3, 9, 15 are absent, so the
# raw labels are not contiguous).
le = LabelEncoder()
Y_enc = le.fit_transform(Y)
# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y_enc, test_size=0.2, random_state=42)
len(reu_df.iloc[:,3:].columns)
52
type(Y_enc)
numpy.ndarray
X.shape
(155520, 52)
# def split(use_sklearn, reduced_data, sc):
# X = sc.transform(reduced_data.iloc[:,3:])
# Y = reduced_data['faultNumber']
# le = LabelEncoder()
# Y_enc = le.fit_transform(Y)
# if use_sklearn is not None:
# x_train, x_test, y_train, y_test = train_test_split(X, Y_enc, test_size=0.2, random_state=42)
# return x_train, x_test, y_train, y_test
# else:
# return X, Y_enc
# x_train, y_train = split(use_sklearn=None, reduced_data=reu_df, sc=sc)
# x_test, y_test = split(use_sklearn=None, reduced_data=reu_dft, sc=sc)
# x_train, x_test, y_train, y_test = split(use_sklearn="use", reduced_data=reu_df, sc=sc)
x_train
array([[-0.51094342, 1.69216389, 0.65944862, ..., 0.05511975,
-0.21888544, 0.22299588],
[ 0.2719047 , -0.18690039, -0.06191411, ..., -0.34199446,
0.34291703, -0.16850446],
[-0.09508277, -0.67893026, -0.13996881, ..., -0.55632036,
-0.11803242, -0.05413602],
...,
[ 0.16143878, -0.52753645, 0.5050547 , ..., -0.55264187,
0.34347474, 0.29873513],
[-0.40461292, -0.05999676, 0.33693687, ..., -0.08617739,
-0.04980558, -0.33756143],
[ 0.02891727, -0.27150281, -0.02588886, ..., -0.10229782,
-0.0949803 , -0.15721953]])
# joblib.dump(le, 'model/label_encoder.pkl')
# joblib.dump(sc, 'model/standard_scalar.pkl')
['model/standard_scalar.pkl']
# Sanity-check the 80/20 train/test split dimensions.
print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)
Shape of x_train: (124416, 52) Shape of y_train: (124416,) Shape of x_test: (31104, 52) Shape of y_test: (31104,)
# # Create an instance of each algorithm
# logreg = LogisticRegression(max_iter=10000)
# svm = SVC()
# dt = DecisionTreeClassifier()
# rf = RandomForestClassifier()
# nb = GaussianNB()
# knn = KNeighborsClassifier()
# xg = xgb.XGBClassifier()
# # Train the algorithms on the data
# logreg.fit(x_train, y_train)
# # svm.fit(x_train, y_train)
# dt.fit(x_train, y_train)
# rf.fit(x_train, y_train)
# nb.fit(x_train, y_train)
# knn.fit(x_train, y_train)
# xg.fit(x_train, y_train)
# # Use the trained models to make predictions on new data
# y_pred_logreg =logreg.predict(x_test)
# # y_pred_svm = svm.predict(x_test)
# y_pred_dt = dt.predict(x_test)
# y_pred_rf = rf.predict(x_test)
# y_pred_nb = nb.predict(x_test)
# y_pred_knn = knn.predict(x_test)
# y_pred_xg = xg.predict(x_test)
# y_pred_logreg_train =logreg.predict(x_train)
# # y_pred_svm = svm.predict(x_test)
# y_pred_dt_train = dt.predict(x_train)
# y_pred_rf_train = rf.predict(x_train)
# y_pred_nb_train = nb.predict(x_train)
# y_pred_knn_train = knn.predict(x_train)
# y_pred_xg_train = xg.predict(x_train)
# # Create a function to plot the confusion matrix
# def plot_confusion_matrix(y_true, y_pred, title):
# f, ax = plt.subplots(figsize=(15, 15))
# cm = confusion_matrix(y_true, y_pred,normalize='true')
# sns.heatmap(cm, annot=True, cmap='Blues',ax=ax)
# ax.set_title(title)
# ax.set_xlabel('Predicted')
# ax.set_ylabel('Actual')
# ax.set_ylim(len(cm), 0)
# plt.tight_layout()
# plt.show()
# # Plot the confusion matrix and print the f1 score for each algorithm
# plot_confusion_matrix(y_test, y_pred_logreg, 'Logistic Regression Confusion Matrix')
# print("Logistic Regression accuracy_score:", accuracy_score(y_test, y_pred_logreg))
# # plot_confusion_matrix(y_test, y_pred_svm, 'SVM Confusion Matrix')
# # print("SVM accuracy_score:", accuracy_score(y_test, y_pred_svm))
# plot_confusion_matrix(y_test, y_pred_dt, 'Decision Tree Confusion Matrix')
# print("Decision Tree accuracy_score:", accuracy_score(y_test, y_pred_dt))
# plot_confusion_matrix(y_test, y_pred_rf, 'Random Forest Confusion Matrix')
# print("Random Forest accuracy_score:", accuracy_score(y_test, y_pred_rf))
# plot_confusion_matrix(y_test, y_pred_nb, 'Naive Bayes Confusion Matrix')
# print("Naive Bayes accuracy_score:", accuracy_score(y_test, y_pred_nb))
# plot_confusion_matrix(y_test, y_pred_knn, 'KNN Confusion Matrix')
# print("KNN accuracy_score:", accuracy_score(y_test, y_pred_knn))
# plot_confusion_matrix(y_test, y_pred_xg, 'Xgb Confusion Matrix')
# print("Xgb accuracy_score:", accuracy_score(y_test, y_pred_xg))
Logistic Regression accuracy_score: 0.5801183127572016
Decision Tree accuracy_score: 0.8238490226337448
Random Forest accuracy_score: 0.8959297839506173
Naive Bayes accuracy_score: 0.6526813271604939
KNN accuracy_score: 0.46405606995884774
Xgb accuracy_score: 0.925829475308642
# # Plot the confusion matrix and print the f1 score for each algorithm
# # plot_confusion_matrix(y_test, y_pred_logreg, 'Logistic Regression Confusion Matrix')
# print("Logistic Regression accuracy_score:", accuracy_score(y_train, y_pred_logreg_train))
# # plot_confusion_matrix(y_test, y_pred_svm, 'SVM Confusion Matrix')
# # print("SVM accuracy_score:", accuracy_score(y_test, y_pred_svm))
# # plot_confusion_matrix(y_test, y_pred_dt, 'Decision Tree Confusion Matrix')
# print("Decision Tree accuracy_score:", accuracy_score(y_train, y_pred_dt_train))
# # plot_confusion_matrix(y_test, y_pred_rf, 'Random Forest Confusion Matrix')
# print("Random Forest accuracy_score:", accuracy_score(y_train, y_pred_rf_train))
# # plot_confusion_matrix(y_test, y_pred_nb, 'Naive Bayes Confusion Matrix')
# print("Naive Bayes accuracy_score:", accuracy_score(y_train, y_pred_nb_train))
# # plot_confusion_matrix(y_test, y_pred_knn, 'KNN Confusion Matrix')
# print("KNN accuracy_score:", accuracy_score(y_train, y_pred_knn_train))
# # plot_confusion_matrix(y_test, y_pred_xg, 'Xgb Confusion Matrix')
# print("Xgb accuracy_score:", accuracy_score(y_train, y_pred_xg_train))
Logistic Regression accuracy_score: 0.5836387602880658 Decision Tree accuracy_score: 0.9859744727366255 Random Forest accuracy_score: 0.9859744727366255 Naive Bayes accuracy_score: 0.6517730838477366 KNN accuracy_score: 0.5820392875514403 Xgb accuracy_score: 0.9662422839506173
# 5-fold cross-validated accuracy of the XGBoost model.
# NOTE(review): this cross-validates on the held-out test split only, and
# `xg` is created/fitted in a commented-out cell above — confirm it exists
# (fitted) in the current session before running.
cv_acc_scores = cross_val_score(xg, x_test, y_test, cv=5, n_jobs=-1)
print(cv_acc_scores)
[0.9011413 0.89149654 0.89213953 0.89085356 0.88987138]
# Define the parameter grid for hyperparameter tuning
param_grid = {
'max_depth': [3, 5, 7],
'n_estimators': [100, 200, 300]
}
# 'colsample_bytree': [0.6, 0.8, 1.0],
# 'gamma': [0, 1, 5] 'learning_rate': [0.1, 0.01, 0.001],
# 'n_estimators': [100, 200, 300],
# 'subsample': [0.6, 0.8, 1.0]
# Define the cross-validation strategy (2 folds keeps the 9-combination
# grid affordable on ~124k rows).
kfold = KFold(n_splits=2, shuffle=True, random_state=42)
# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()
# Initialize the grid search object.
# NOTE(review): GridSearchCV must be imported from sklearn.model_selection —
# the original notebook raised a NameError here (see output below).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=kfold, n_jobs=-1,verbose=True)
# Fit the grid search object to the data
grid_result = grid_search.fit(x_train, y_train)
# Print the best hyperparameters and their corresponding score
print(f"Best score: {grid_result.best_score_:.4f}")
print("Best hyperparameters:")
for param, value in grid_result.best_params_.items():
    print(f"{param}: {value}")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[56], line 13 2 param_grid = { 3 'max_depth': [3, 5, 7], 4 'n_estimators': [100, 200, 300] 5 } 7 # 'colsample_bytree': [0.6, 0.8, 1.0], 8 # 'gamma': [0, 1, 5] 'learning_rate': [0.1, 0.01, 0.001], 9 # 'n_estimators': [100, 200, 300], 10 # 'subsample': [0.6, 0.8, 1.0] 11 12 # Define the cross-validation strategy ---> 13 kfold = KFold(n_splits=2, shuffle=True, random_state=42) 15 # Initialize the XGBoost classifier 16 xgb_model = xgb.XGBClassifier() NameError: name 'KFold' is not defined
import lime
import lime.lime_tabular
# LIME explainer over the (scaled) training matrix; class names come from the
# fitted LabelEncoder so they display as the original fault numbers.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(x_train, feature_names=reu_df.columns[3:], class_names=le.classes_, discretize_continuous=True)
# Explain a single test instance.
idx=34
# expand_dims gives the (1, n_features) shape predict() expects.
y_pred = xg.predict(np.expand_dims(x_test[idx], axis=0))
# y_pred = enc.inverse_transform(y_pred)[0][0]
print(f"Predicted fault is {y_pred}")
# Top-10 features for the most likely class only.
exp = lime_explainer.explain_instance(x_test[idx], xg.predict_proba, num_features=10, top_labels=1)
note = exp.show_in_notebook(show_table=True, show_all=False)
Predicted fault is [15]
import shap
shap.initjs()
# Re-wrap the numpy arrays as DataFrames so SHAP plots show column names.
x_train = pd.DataFrame(x_train, columns=df.iloc[:,3:].columns)
x_test = pd.DataFrame(x_test, columns=df.iloc[:,3:].columns)
# explain the model's predictions using SHAP.
# NOTE(review): every 50th training row (~2489 samples) is a large background
# set for KernelExplainer — SHAP itself warns this will be slow; consider
# shap.sample / shap.kmeans to summarize it.
explainer = shap.KernelExplainer(xg.predict_proba, x_train.iloc[::50])
idx_sample = 34
# SHAP values for one test instance: one array per fault class.
shap_values = explainer.shap_values(x_test.iloc[idx_sample])
Using 2489 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
type(shap.force_plot(explainer.expected_value[fault_cls], shap_values[fault_cls],sampled_test_data))
shap.plots._force.AdditiveForceVisualizer
# Single test instance and its true (encoded) label.
sampled_test_data = x_test.iloc[idx_sample]
sampled_test_labels = y_test[idx_sample]
print(f"the predicted class is: {xg.predict([sampled_test_data])}")
cls_index = sampled_test_labels
print(f"the Actual class is: {cls_index}")
# One force plot per class; 18 classes = 21 fault labels (0-20) minus the
# three (3, 9, 15) removed by wrangle().
for fault_cls in range(0,18):
    print(f"Force plot for fault class: {fault_cls}")
    display(shap.force_plot(explainer.expected_value[fault_cls], shap_values[fault_cls],sampled_test_data ))
the predicted class is: [15] the Actual class is: 15 Force plot for fault class: 0
Force plot for fault class: 1
Force plot for fault class: 2
Force plot for fault class: 3
Force plot for fault class: 4
Force plot for fault class: 5
Force plot for fault class: 6
Force plot for fault class: 7
Force plot for fault class: 8
Force plot for fault class: 9
Force plot for fault class: 10
Force plot for fault class: 11
Force plot for fault class: 12
Force plot for fault class: 13
Force plot for fault class: 14
Force plot for fault class: 15
Force plot for fault class: 16
Force plot for fault class: 17
# explain the model's predictions using SHAP (TreeExplainer is exact and much
# faster than KernelExplainer for tree ensembles like XGBoost)
explainer = shap.TreeExplainer(xg)
# SHAP values for every 50th test row: a list with one array per fault class.
shap_values = explainer.shap_values(x_test.iloc[::50])
# ntree_limit is deprecated, use `iteration_range` or model slicing instead.
# x_test[::50]
# Visualize the SHAP values for a single example
sampled_test_data = x_test.iloc[::50]
sampled_test_labels = y_test[::50]
sample_idx = 45
print(f"the predicted class is: {xg.predict(sampled_test_data.iloc[[sample_idx]])}")
cls_index = sampled_test_labels[sample_idx]
print(f"the Actual class is: {cls_index}")
# Bug fix: iterate over the classes SHAP actually returned instead of a
# hard-coded range(0, 19) — the model has only 18 classes (faults 3, 9 and 15
# are excluded), so the final iteration raised IndexError.
for fault_cls in range(len(shap_values)):
    print(f"Force plot for fault class: {fault_cls}")
    display(shap.force_plot(explainer.expected_value[fault_cls], shap_values[fault_cls][sample_idx],sampled_test_data.iloc[sample_idx] ))
[19:01:55] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\c_api\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
the predicted class is: [6] the Actual class is: 6 Force plot for fault class: 0
Force plot for fault class: 1
Force plot for fault class: 2
Force plot for fault class: 3
Force plot for fault class: 4
Force plot for fault class: 5
Force plot for fault class: 6
Force plot for fault class: 7
Force plot for fault class: 8
Force plot for fault class: 9
Force plot for fault class: 10
Force plot for fault class: 11
Force plot for fault class: 12
Force plot for fault class: 13
Force plot for fault class: 14
Force plot for fault class: 15
Force plot for fault class: 16
Force plot for fault class: 17
Force plot for fault class: 18
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[41], line 20 18 for fault_cls in range(0,19): 19 print(f"Force plot for fault class: {fault_cls}") ---> 20 display(shap.force_plot(explainer.expected_value[fault_cls], shap_values[fault_cls][sample_idx],sampled_test_data.iloc[sample_idx] )) IndexError: list index out of range
explainer = shap.TreeExplainer(xg)
# Pull the full time series of one fault class (fault 10), first simulation
# run, and scale it with the scaler fitted on the training data.
F_num=10
df_new= df[(df['faultNumber']==F_num) & (df['simulationRun']==1)].copy()
x_new = df_new.iloc[:,3:]
x_new = sc.transform(x_new)
# Re-wrap as a DataFrame so downstream plots keep column names.
x_new = pd.DataFrame(x_new, columns=df_new.iloc[:,3:].columns)
[20:01:05] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\c_api\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Generate random numbers and append to x_new
# rand_nums = np.random.rand(len(x_new), 1)
# x_new = np.hstack((x_new, rand_nums))
# generate shapely values for every sample of the fault-10 run
shap_values_sample = explainer.shap_values(x_new)
y_pred = xg.predict(x_new)
# shap values for this fault class (map the raw fault number to its encoded
# class index first)
data = shap_values_sample[le.transform([F_num])[0]]
# Convert encoded predictions back to original fault numbers for plotting.
y_pred = le.inverse_transform(y_pred)
f, ax = plt.subplots(figsize=(20,10))
# Predicted class per sample. Fix: the `palette` kwarg was dropped — seaborn
# ignores it (with a warning) when no `hue` variable is assigned.
sns.scatterplot(x=df_new['sample'],y=y_pred,s=80,alpha=0.4)
# Fault is introduced at sample 20 (1 hour in at 3-minute sampling).
plt.axvline(20,c='r',alpha=0.8,linestyle ='--')
plt.axhline(F_num,c='black',alpha=0.8,label='Actual Fault',linestyle ='--')
plt.legend(bbox_to_anchor=(1.5, 1.05))
plt.ylabel('Predicted fault class')
plt.title(f'Actual fault-{F_num}')
plt.yticks(np.arange(-1,21,1))
plt.show()
Ignoring `palette` because no `hue` variable has been assigned.
# Plot each feature's SHAP value over time for the selected fault class.
# create a figure and axis object
fig, ax = plt.subplots(figsize=(10,5))
# loop over the columns and plot each one.
# NOTE(review): labels come from sampled_test_data.columns while `data` was
# computed from x_new — this assumes both frames share the same column order;
# confirm, or label from x_new.columns directly.
for i in range(data.shape[1]):
    ax.plot(data[:, i], label=f'{sampled_test_data.columns[i]}')
# set the x-axis label and tick positions
ax.set_xlabel('Sample')
ax.set_xticks(np.arange(0, data.shape[0], 50))
# set the y-axis label
ax.set_ylabel('SHAP Value')
# add a legend (placed outside the axes)
ax.legend(bbox_to_anchor=(1.5, 1.05))
# show the plot
plt.show()
# Per-class coefficient barplots.
# NOTE(review): Coeff_df and LogRegCls are not defined anywhere in this file's
# visible cells — presumably a logistic-regression coefficient DataFrame and
# the fitted classifier from an earlier/removed cell; verify before running.
# Create barplots for each row of the DataFrame
for i in range(len(Coeff_df)):
    row_data = Coeff_df.iloc[i]
    plt.figure(figsize=(25, 5))
    sns.barplot(x=row_data.index, y=row_data.values)
    plt.title(f"Fault class: {LogRegCls.classes_[i]}")
    plt.xticks(rotation=90)
    plt.show()
# Mean absolute coefficient across classes = a combined feature importance.
row_data = Coeff_df.abs().mean()
row_data = row_data.sort_values(ascending=False)
plt.figure(figsize=(25, 5))
sns.barplot(x=row_data.index, y=row_data.values)
plt.title(f"Combine feature Importance")
plt.xticks(rotation=90)
plt.show()
# Global SHAP summary: top-10 features by mean |SHAP| across classes.
shap.summary_plot(shap_values,feature_names=reu_df.iloc[:,3:].columns, max_display=10)
import shap
import plotly.express as px
# Assuming you've already calculated SHAP values as shap_values
# and have the feature names as feature_names
# Create a SHAP summary plot using Plotly Express.
# NOTE(review): shap_values[0] here is the per-class SHAP matrix for class 0,
# not a single instance's values — confirm the intended slice before trusting
# this chart.
fig = px.scatter(
x=shap_values[0], # SHAP values for the first instance
y=df.iloc[:,3:].columns, # Feature names
color=shap_values[0], # Color by SHAP values
labels={"x": "SHAP Value"},
title="SHAP Summary Plot",
)
fig.show()
# Barplot of the decision tree's feature importances, largest first.
importances = DT.feature_importances_
# Feature columns plus the injected random "control_feature" baseline.
feature_name = np.append(np.array(reduced_data.columns[3:]), "control_feature")
feature_importance = (
    pd.DataFrame({'feature_name': feature_name, 'importances': importances})
    .sort_values('importances', ascending=False)
)
plt.figure(figsize=(25, 5))
sns.barplot(x=feature_importance['feature_name'], y=feature_importance['importances'])
plt.title(f"Decision Tree feature Importance")
plt.xticks(rotation=90)
plt.show()
from sklearn import tree

# Render the fitted decision tree with its feature names; the return value
# is discarded since only the figure is of interest.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(DT, feature_names=reduced_data.columns[3:], filled=True)
In this series, we will use accuracy as the primary metric for evaluating the performance of the different machine learning algorithms. We will update the table as we evaluate the performance of other algorithms in subsequent sections.
Average accuracy score obtained for each method, excluding faults No. 9 and 15 (no features were dropped; all 52 sensor measurements were used)
| Method | Accuracy |
|---|---|
| XG Boost | 0.887 |
| Random Forest | 0.943 |
# For each fault class, predict on simulation run 1 and plot the predicted
# class per sample, coloured by the model's confidence in that prediction.
for F_num in range(21):
    # First simulation run of this fault class only.
    mask = (df['faultNumber'] == F_num) & (df['simulationRun'] == 1)
    df_new = df[mask].copy()
    x_new = sc.transform(df_new.iloc[:, 3:])
    y_pred = le.inverse_transform(xg.predict(x_new))
    # Confidence = highest class probability per sample.
    y_proba = xg.predict_proba(x_new).max(axis=1)
    f, ax = plt.subplots(figsize=(10, 3))
    sns.scatterplot(x=df_new['sample'], y=y_pred, hue=y_proba, palette="coolwarm", s=80, alpha=0.4)
    # Vertical line: sample at which the fault is introduced; horizontal
    # line: the true fault class.
    plt.axvline(20, c='r', alpha=0.8, linestyle='--')
    plt.axhline(F_num, c='cyan', alpha=0.8, label='Actual Fault', linestyle='--')
    plt.legend(bbox_to_anchor=(1.5, 1.05))
    plt.ylabel('Predicted fault class')
    plt.title(f'Actual fault-{F_num}')
    plt.show()
Fault Nos. 3, 9, 15, 19, 20
# Overlay fault-free (fault 0) and faulty (fault 9) sensor traces for
# simulation run 1, one subplot per feature column.
faultFree = DF[(DF['faultNumber'] == 0) & (DF['simulationRun'] == 1)]
faulty = DF[(DF['faultNumber'] == 9) & (DF['simulationRun'] == 1)]
fig, ax = plt.subplots(13, 4, figsize=(30, 70))
axes = ax.ravel()  # flatten once instead of ravelling inside the loop
for i, col in enumerate(DF.iloc[:, 3:]):  # all feature columns
    axes[i].plot(faultFree['sample'], faultFree[col], label='FaultFree')
    axes[i].plot(faulty['sample'], faulty[col], label='Faulty', alpha=0.6)
    axes[i].set_xlabel('sample no.')
    axes[i].set_ylabel(col)
    axes[i].legend()
# Accuracy of the XGBoost model for each fault class, excluding the classes
# 3, 9 and 15, evaluated on one randomly chosen simulation run per fault.
Acc = []
for Fnum in [f for f in range(21) if f not in (3, 9, 15)]:
    # Post-fault samples (>20) of a random run of this fault class.
    # NOTE(review): randint(1, 500) never selects run 500 — confirm whether
    # runs span 1..500 inclusive.
    temp_df = DF[(DF['faultNumber'] == Fnum)
                 & (DF['sample'] > 20)
                 & (DF['simulationRun'] == np.random.randint(1, 500))].copy()
    X_faulty = sc.transform(temp_df.iloc[:, 3:])
    y_true = temp_df['faultNumber']
    y_pred = le.inverse_transform(xg.predict(X_faulty))
    acc = accuracy_score(y_true, y_pred)
    print(f'The accuracy for Fault_num={Fnum} is {acc}')
    Acc.append(acc)
print('#' * 20)
print(f'The overall average accuracy of XG boost is {np.mean(Acc)}')
# `joblib` was used here before its import (which appeared later in the
# file), so import it locally to keep this cell self-contained.
import joblib

# Persist every fitted model so they can be reloaded without retraining.
joblib.dump(logreg, 'model/logistic_regression_model.pkl')
joblib.dump(dt, 'model/decision_tree_model.pkl')
joblib.dump(rf, 'model/random_forest_model.pkl')
joblib.dump(nb, 'model/naive_bayes_model.pkl')
joblib.dump(knn, 'model/knn_model.pkl')
joblib.dump(xg, 'model/xgboost_model.pkl')
['model/xgboost_model.pkl']
import joblib

# Reload the persisted preprocessing scaler and all trained classifiers
# from disk (loads are independent, so the scaler is restored first).
sc = joblib.load('model/standard_scalar.pkl')
logreg = joblib.load('model/logistic_regression_model.pkl')
dt = joblib.load('model/decision_tree_model.pkl')
rf = joblib.load('model/random_forest_model.pkl')
nb = joblib.load('model/naive_bayes_model.pkl')
knn = joblib.load('model/knn_model.pkl')
xg = joblib.load('model/xgboost_model.pkl')
# One raw sample of the sensor measurements, captured as strings.
ls = ['0.3279', '3672.1', '4498.5', '9.3309', '26.882', '42.23', '2731.1', '75.143', '120.39', '0.32868', '80.305', '49.647', '2657.8', '24.689', '51.507', '3143.0', '23.191', '65.948', '234.64', '330.53', '94.875', '77.412', '31.077', '8.8385', '27.42', '7.0049', '18.594', '1.6378', '30.921', '13.635', '24.803', '1.2767', '18.439', '2.2702', '4.8982', '2.2933', '0.022881', '0.85299', '0.11356', '53.254', '44.275', '63.668', '54.306', '31.892', '61.533', '19.979', '40.044', '37.061', '50.021', '48.708', '40.319', '17.414']
# Convert to floats explicitly instead of relying on sklearn's implicit
# string-to-float cast of a '<U...' string array.
ls1 = np.array(ls, dtype=float)
# NOTE(review): sc was fitted on a named DataFrame, so this still warns about
# missing feature names — passing a DataFrame with the training columns would
# silence it; confirm the column order before doing so.
ls_enc = sc.transform([ls1])
c:\Users\Lenovo\anaconda3\Lib\site-packages\sklearn\base.py:465: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
# Sanity check: the second scaled feature of the encoded sample.
ls_enc[0][1]
0.2249798121105276
# Predict the (label-encoded) fault class for the single scaled sample.
xg.predict(ls_enc)[0]
1
# Inspect the flask package's attributes — presumably in preparation for
# serving the saved models behind a web endpoint (TODO confirm).
import flask
dir(flask)
['Blueprint', 'Config', 'Flask', 'Markup', 'Request', 'Response', '__builtins__', '__cached__', '__doc__', '__file__', '__getattr__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'abort', 'after_this_request', 'app', 'appcontext_popped', 'appcontext_pushed', 'appcontext_tearing_down', 'before_render_template', 'blueprints', 'cli', 'config', 'copy_current_request_context', 'ctx', 'current_app', 'escape', 'flash', 'g', 'get_flashed_messages', 'get_template_attribute', 'globals', 'got_request_exception', 'has_app_context', 'has_request_context', 'helpers', 'json', 'jsonify', 'logging', 'make_response', 'message_flashed', 'redirect', 'render_template', 'render_template_string', 'request', 'request_finished', 'request_started', 'request_tearing_down', 'scaffold', 'send_file', 'send_from_directory', 'session', 'sessions', 'signals', 'signals_available', 'stream_template', 'stream_template_string', 'stream_with_context', 'template_rendered', 'templating', 'typing', 'url_for', 'wrappers']